Proteomics: data analysis
Prerequisites
Load the packages needed for the analyses and read-in the relavant data
library(scales)
library(data.table)
library(vctrs)
library(tidyverse)
library(plotly)
library(TissueEnrich)
library(ggrepel)
setwd("/work/LAS/geetu-lab/arnstrm/mouse.trophoblast.smallRNAseq")
dat <-
read.csv("assets/data_processed.csv",
row.names = NULL,
stringsAsFactors = TRUE)
orig_data <-
read.csv("assets/orig_data.csv",
row.names = NULL,
stringsAsFactors = TRUE)
source("assets/theme_clean.R")quick check:
dat_tidy <- dat %>% drop_na()
dat %>% summarise(Number_of_proteins = n())
#> Number_of_proteins
#> 1 174Volcano plot
df <- orig_data
exp <- log(1.5, 2)
df$log2fc <- log(df$Ratio, base = 2)
df$negLog10p = -log10(df$Qvalue)
df$diffexpressed[df$log2fc <= -exp &
df$Qvalue <= 0.05] <- "up in Diff"
df$diffexpressed[df$log2fc >= exp &
df$Qvalue <= 0.05] <- "up in Undiff"
df$diffexpressed[df$log2fc >= -exp &
df$log2fc <= exp] <- "other genes"
df$diffexpressed[df$Qvalue > 0.05] <- "other genes"
df$newGenes <-
str_replace_all(string = df$Genes,
pattern = "\\;.*$",
replacement = "")
data.table::setDT(df)[diffexpressed == "other genes", newGenes := NA]
g <-
ggplot(data = df,
aes(
x = log2fc,
y = negLog10p,
col = diffexpressed,
label = newGenes
)) +
geom_point(alpha = 0.5) +
theme_classic() +
scale_color_manual(name = "Expression",
values = c("grey", "#c6007b", "#a0b600")) +
ggtitle(paste("Diff vs. Undiff (proteomics)")) +
# geom_text_repel(show.legend = F) +
xlab("Log2 fold change") +
ylab("-log10 pvalue") +
theme(legend.text.align = 0)
ggplotly(g)Figure 1: Volcano plot (proteomics)
#ggsave("prot_volc.png", dpi=900, width = 8, height = 6)PCA plot
This was run on another R version due to incompatibility with existing packages. The command is shown below, but it is not executed when knitting the document.
library("MetaboAnalystR")
library(ggplot2)
library(ggrepel)
setwd("~/metabo")
mSet <- InitDataObjects("conc", "stat", FALSE)
mSet <- Read.TextData(mSet, "data_processed.csv", "colu", "disc")
mSet <- SanityCheckData(mSet)
mSet <- RemoveMissingPercent(mSet, percent = 0.5)
mSet <- ImputeMissingVar(mSet, method = "min")
mSet <- PreparePrenormData(mSet)
mSet <-
Normalization(mSet,
"NULL",
"LogNorm",
"NULL",
ratio = FALSE,
ratioNum = 20)
mSet <-
PlotSampleNormSummary(mSet, "snorm_0_", "png", 300, width = NA)
mSet <- PCA.Anal(mSet)
saveRDS(mSet, "mSet.rds")Read in the saved RDS file from the previous step and plot the PCA for samples.
mSet <- readRDS("assets/mset.rds")
group <- sapply(strsplit(rownames(mSet$analSet$pca$x), "_"), "[", 1)
intgroup.df <- as.data.frame(group)
d <- data.frame(
PC1 = mSet$analSet$pca$x[, 1],
PC2 = mSet$analSet$pca$x[, 2],
intgroup.df,
name = rownames(mSet$analSet$pca$x)
)
g <- ggplot(d, aes(PC1, PC2, color = group)) +
scale_shape_manual(values = 1:10) +
scale_color_manual(values = c('Diff' = '#c6007b',
'Undiff' = '#a0b600')) +
theme_classic() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
geom_text_repel(aes(label = name)) +
xlab(paste("PC1", round(mSet$analSet$pca$variance[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(mSet$analSet$pca$variance[2] * 100, 2), "% variance"))
gFigure 2: PCA plot (proteomics)
Tissue Enrichment
plotTE <- function(inputGenes = gene.list,
myColor = "color") {
gs <-
GeneSet(geneIds = inputGenes,
organism = "Mus Musculus",
geneIdType = SymbolIdentifier())
output <- teEnrichment(inputGenes = gs, rnaSeqDataset = 3)
en.output <-
setNames(data.frame(assay(output[[1]]),
row.names = rowData(output[[1]])[, 1]),
colData(output[[1]])[, 1])
en.output$Tissue <- rownames(en.output)
logp <- -log10(0.05)
en.output <-
mutate(en.output,
significance = ifelse(Log10PValue > logp,
"colored", "nocolor"))
en.output$Sig <- "NA"
ggplot(en.output, aes(reorder(Tissue, Log10PValue),
Log10PValue,
fill = significance)) +
geom_bar(stat = 'identity') +
theme_clean() + ylab("- log10 adj. p-value") + xlab("") +
scale_fill_manual(values = c("colored" = myColor, "nocolor" = "gray")) +
scale_y_continuous(expand = expansion(mult = c(0, .1)),
breaks = scales::pretty_breaks()) +
coord_flip()
}
exp <- log(1.2, 2)
diff <- df$newGenes[df$log2fc <= -exp & df$Qvalue <= 0.05]
undiff <- df$newGenes[df$log2fc >= exp & df$Qvalue <= 0.05]
df.diff <- dat_tidy %>%
rowwise() %>%
mutate(Diff = mean(c(
Diff_rep1, Diff_rep2, Diff_rep3, Diff_rep4, Diff_rep5
))) %>%
dplyr::select(proteinID, Diff) %>%
dplyr::filter(Diff > 0) %>%
ungroup() %>%
mutate(quart = ntile(Diff, 4)) %>%
mutate(decile = ntile(Diff, 10))
df.undiff <- dat_tidy %>%
rowwise() %>%
mutate(Undiff = mean(
c(
Undiff_rep1,
Undiff_rep2,
Undiff_rep3,
Undiff_rep4,
Undiff_rep5
)
)) %>%
dplyr::select(proteinID, Undiff) %>%
dplyr::filter(Undiff > 0) %>%
ungroup() %>%
mutate(quart = ntile(Undiff, 4)) %>%
mutate(decile = ntile(Undiff, 10))
filterCuts <- function(dataIn = df.undiff,
cutOff = 4,
type = decile) {
type <- enquo(type)
dataIn %>%
dplyr::filter(!!type == cutOff) %>%
dplyr::select(proteinID)
}
undiff.75pc <- filterCuts(df.undiff, 4, type = quart)
diff.75pc <- filterCuts(df.diff, 4, type = quart)
undiff.90pc <- filterCuts(df.undiff, 10, type = decile)
diff.90pc <- filterCuts(df.diff, 10, type = decile)Enrichment plots (TissueEnrich)
Diff DE
plotTE(unique(diff), myColor = "#c6007b")Figure 3A: Differentially expressed proteins (up-regulated in Diff) enrichment in Mouse ENCODE data
Undiff DE
plotTE(unique(undiff), myColor = "#a0b600")Figure 3B: Differentially expressed proteins (up-regulated in Undiff) enrichment in Mouse ENCODE data
Diff 75th pc
plotTE(as.character(diff.75pc$proteinID), "#c6007b")Figure 3C: Top quartile proteins (Diff) enrichment in Mouse ENCODE data
Diff 90th pc
plotTE(as.character(diff.90pc$proteinID), "#c6007b")Figure 3D: Top decile proteins (Diff) enrichment in Mouse ENCODE data
Undiff 75th pc
plotTE(as.character(undiff.75pc$proteinID), "#a0b600")Figure 3E: Top quartile proteins (Undiff) enrichment in Mouse ENCODE data
Undiff 90th pc
plotTE(as.character(undiff.90pc$proteinID), "#a0b600")Figure 3F: Top decile proteins (Diff) enrichment in Mouse ENCODE data